From 0068fb5745870c50ea428294f7ecd3dcf733eaf7 Mon Sep 17 00:00:00 2001
From: Daniel Sabo <DanielSabo@gmail.com>
Date: Sun, 27 Dec 2015 07:29:55 -0800
Subject: [PATCH] Add support for hardware half<->float conversions

These instructions require a Ivy Bridge or newer processor, so I've only
been able to test them under the Intel Software Development Emulator.
---
 babl/babl-cpuaccel.c   |   6 +-
 babl/babl-cpuaccel.h   |   3 +
 configure.ac           |  22 ++++
 extensions/Makefile.am |   3 +
 extensions/sse-half.c  | 270 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 extensions/sse-half.c

diff --git a/babl/babl-cpuaccel.c b/babl/babl-cpuaccel.c
index 4e1683e..59fdcdd 100644
--- a/babl/babl-cpuaccel.c
+++ b/babl/babl-cpuaccel.c
@@ -118,7 +118,8 @@ enum
   ARCH_X86_INTEL_FEATURE_SSSE3    = 1 << 9,
   ARCH_X86_INTEL_FEATURE_SSE4_1   = 1 << 19,
   ARCH_X86_INTEL_FEATURE_SSE4_2   = 1 << 20,
-  ARCH_X86_INTEL_FEATURE_AVX      = 1 << 28
+  ARCH_X86_INTEL_FEATURE_AVX      = 1 << 28,
+  ARCH_X86_INTEL_FEATURE_F16C     = 1 << 29,
 };
 
 #if !defined(ARCH_X86_64) && (defined(PIC) || defined(__PIC__))
@@ -244,6 +245,9 @@ arch_accel_intel (void)
 
     if (ecx & ARCH_X86_INTEL_FEATURE_SSE4_1)
       caps |= BABL_CPU_ACCEL_X86_SSE4_1;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_F16C)
+      caps |= BABL_CPU_ACCEL_X86_F16C;
 #endif /* USE_SSE */
   }
 #endif /* USE_MMX */
diff --git a/babl/babl-cpuaccel.h b/babl/babl-cpuaccel.h
index 57eb118..8040d73 100644
--- a/babl/babl-cpuaccel.h
+++ b/babl/babl-cpuaccel.h
@@ -32,6 +32,9 @@ typedef enum
   BABL_CPU_ACCEL_X86_SSE3    = 0x02000000,
   BABL_CPU_ACCEL_X86_SSSE3   = 0x00800000,
   BABL_CPU_ACCEL_X86_SSE4_1  = 0x00400000,
+  /* BABL_CPU_ACCEL_X86_SSE4_2  = 0x00200000, */
+  /* BABL_CPU_ACCEL_X86_AVX     = 0x00080000, */
+  BABL_CPU_ACCEL_X86_F16C    = 0x00040000,
 
   /* powerpc accelerations */
   BABL_CPU_ACCEL_PPC_ALTIVEC = 0x04000000,
diff --git a/configure.ac b/configure.ac
index f09c7ac..28e9af0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -303,6 +303,10 @@ AC_ARG_ENABLE(sse4_1,
   [  --enable-sse4_1            enable SSE4_1 support (default=auto)],,
   enable_sse4_1=$enable_sse)
 
+AC_ARG_ENABLE(f16c,
+  [  --enable-f16c            enable hardware half-float support (default=auto)],,
+  enable_f16c=$enable_sse)
+
 if test "x$enable_mmx" = xyes; then
   BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx')
   SSE_EXTRA_CFLAGS=
@@ -378,6 +382,24 @@ if test "x$enable_mmx" = xyes; then
         fi
       fi
 
+      if test "x$enable_f16c" = xyes; then
+        BABL_DETECT_CFLAGS(f16c_flag, '-mf16c')
+        SSE4_1_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $f16c_flag"
+
+        AC_MSG_CHECKING(whether we can compile half-floating point code)
+
+        CFLAGS="$CFLAGS $sse_flag $f16c_flag"
+
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],[_mm_cvtph_ps ((__m128i)_mm_setzero_ps());])],
+          AC_DEFINE(USE_F16C, 1, [Define to 1 if f16c intrinsics are available.])
+          AC_MSG_RESULT(yes)
+        ,
+          enable_f16c=no
+          AC_MSG_RESULT(no)
+          AC_MSG_WARN([The compiler does not support f16c intrinsics.])
+        )
+      fi
+
     fi
   ,
     enable_mmx=no
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index cd7e893..c06aa8f 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -32,6 +32,7 @@ ext_LTLIBRARIES = \
 	sse2-int8.la    \
 	sse2-int16.la   \
 	sse4-int8.la    \
+	sse-half.la     \
 	two-table.la	\
 	ycbcr.la
 
@@ -50,6 +51,7 @@ sse2_float_la_SOURCES = sse2-float.c
 sse2_int8_la_SOURCES = sse2-int8.c
 sse2_int16_la_SOURCES = sse2-int16.c
 sse4_int8_la_SOURCES = sse4-int8.c
+sse_half_la_SOURCES = sse-half.c
 two_table_la_SOURCES = two-table.c two-table-tables.h
 ycbcr_la_SOURCES = ycbcr.c
 float_la_SOURCES = float.c
@@ -62,3 +64,4 @@ sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS)
+sse_half_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) $(F16C_EXTRA_CFLAGS)
diff --git a/extensions/sse-half.c b/extensions/sse-half.c
new file mode 100644
index 0000000..ca57ceb
--- /dev/null
+++ b/extensions/sse-half.c
@@ -0,0 +1,270 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2015 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+
+#include <immintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+static inline long
+conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
+{
+  const uint64_t *s_vec;
+  __v4sf         *d_vec;
+
+  long n = samples;
+
+  s_vec = (const uint64_t *)src;
+  d_vec = (__v4sf *)dst;
+
+  while (n >= 4)
+    {
+      __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
+      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
+      _mm_storeu_ps((float *)d_vec++, out_val);
+      n -= 4;
+    }
+
+  src = (const uint16_t *)s_vec;
+  dst = (float *)d_vec;
+
+  while (n)
+    {
+      __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
+      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
+      _mm_store_ss(dst++, out_val);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+static long
+conv_yaHalf_yaF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgbHalf_rgbF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgbaHalf_rgbaF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 4) / 4;
+}
+
+static inline long
+conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
+{
+  const __v4sf *s_vec;
+  uint64_t     *d_vec;
+
+  long n = samples;
+
+  s_vec = (const __v4sf *)src;
+  d_vec = (uint64_t *)dst;
+
+  while (n >= 4)
+    {
+      __m128 in_val = _mm_loadu_ps((float *)s_vec++);
+      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+      _mm_storel_epi64((__m128i *)d_vec++, out_val);
+      n -= 4;
+    }
+
+  src = (const float *)s_vec;
+  dst = (uint16_t *)d_vec;
+
+  while (n)
+    {
+      __m128 in_val = _mm_load_ss(src++);
+      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+      *dst++ = _mm_extract_epi16(out_val, 0);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+static long
+conv_yaF_yaHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgbF_rgbHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgbaF_rgbaHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 4) / 4;
+}
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+int init (void);
+
+int
+init (void)
+{
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("half"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("half"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbHalf_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("half"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *rgbHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("half"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("half"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("half"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yHalf_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("half"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *yHalf_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("half"),
+    babl_component ("Y'"),
+    NULL);
+
+#define CONV(src, dst) \
+{ \
+  babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+  babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+}
+
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_F16C))
+    {
+      CONV(rgbaHalf, rgbaF);
+      CONV(rgbHalf,  rgbF);
+      CONV(yaHalf,   yaF);
+      CONV(yHalf,    yF);
+      CONV(rgbaF,    rgbaHalf);
+      CONV(rgbF,     rgbHalf);
+      CONV(yaF,      yaHalf);
+      CONV(yF,       yHalf);
+    }
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+  return 0;
+}
+
-- 
2.30.2